package Q17_26_Sparse_Similarity;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
import CtCILibrary.AssortedMethods;
public class QuestionC {
public static class Element implements Comparable<Element> {
public int word;
public int document;
public Element(int w, int d) {
word = w;
document = d;
}
public int compareTo(Element e) {
if (word == e.word) {
return document - e.document;
}
return word - e.word;
}
}
public static HashMap<DocPair, Double> computeSimilarities(HashMap<Integer, Document> documents) {
ArrayList<Element> elements = sortWords(documents);
HashMap<DocPair, Double> similarities = computeIntersections(elements);
adjustToSimilarities(documents, similarities);
return similarities;
}
/* Throw all words into one list, sorting by the word then the document. */
public static ArrayList<Element> sortWords(HashMap<Integer, Document> docs) {
ArrayList<Element> elements = new ArrayList<Element>();
for (Document doc : docs.values()) {
ArrayList<Integer> words = doc.getWords();
for (int word : words) {
elements.add(new Element(word, doc.getId()));
}
}
Collections.sort(elements);
return elements;
}
/* Increment the intersection size of each document pair. */
public static void increment(HashMap<DocPair, Double> similarities, int doc1, int doc2) {
DocPair pair = new DocPair(doc1, doc2);
if (!similarities.containsKey(pair)) {
similarities.put(pair, 1.0);
} else {
similarities.put(pair, similarities.get(pair) + 1);
}
}
/* Adjust the intersection value to become the similarity. */
public static HashMap<DocPair, Double> computeIntersections(ArrayList<Element> elements) {
HashMap<DocPair, Double> similarities = new HashMap<DocPair, Double>();
for (int i = 0; i < elements.size(); i++) {
Element left = elements.get(i);
for (int j = i + 1; j < elements.size(); j++) {
Element right = elements.get(j);
if (left.word != right.word) {
break;
}
increment(similarities, left.document, right.document);
}
}
return similarities;
}
/* Adjust the intersection value to become the similarity. */
public static void adjustToSimilarities(HashMap<Integer, Document> documents, HashMap<DocPair, Double> similarities) {
for (Entry<DocPair, Double> entry : similarities.entrySet()) {
DocPair pair = entry.getKey();
Double intersection = entry.getValue();
Document doc1 = documents.get(pair.doc1);
Document doc2 = documents.get(pair.doc2);
double union = (double) doc1.size() + doc2.size() - intersection;
entry.setValue(intersection / union);
}
}
public static void main(String[] args) {
int numDocuments = 10;
int docSize = 5;
HashMap<Integer, Document> documents = new HashMap<Integer, Document>();
for (int i = 0; i < numDocuments; i++) {
int[] words = AssortedMethods.randomArray(docSize, 0, 10);
ArrayList<Integer> w = Tester.removeDups(words);
System.out.println(i + ": " + w.toString());
Document doc = new Document(i, w);
documents.put(i, doc);
}
HashMap<DocPair, Double> similarities = computeSimilarities(documents);
Tester.printSim(similarities);
}
}